# CpG identifiers (e.g., from MethMatrix): "chr10_100371240", ...
cpgs <- colnames(MethMatrix)[-1]

# Annotate the CpGs via biomaRt; switch build to "GRCh37" if needed.
ann <- annotate_cpgs_biomart(cpgs, build = "GRCh38")

# Save the annotation as a tab-delimited .txt file
# write.table(ann, "/Users/jacenai/Desktop/Matteo_Lab/UGA6/methylation/final_files/cpg_annotation.txt", sep = "\t", row.names = FALSE, quote = FALSE)
# Read the annotation file and derive a single Gene column per CpG,
# choosing the gene label according to the CpG's genomic region.
cpg_annotation_df <- read.table(
  "/Users/jacenai/Desktop/Matteo_Lab/UGA6/methylation/final_files/cpg_annotation.txt",
  header = TRUE, sep = "\t"
) |>
  transmute(
    CpG = CpG_ID,
    Ensembl_ID = Ensembl_ID,
    Gene = case_when(
      Region == "Promoter" ~ coalesce(Promoter_gene, Gene_symbol),  # fallback if missing
      Region == "GeneBody" ~ Gene_symbol,
      Region == "Intergenic" ~ Nearest_gene,
      TRUE ~ NA_character_
    )
  )
Traits
Background
Four vaccination strains:
Vic19 (A/Victoria/2570/2019) - H1N1
Tas20 (A/Tasmania/503/2020) - H3N2
Phu13 (B/Phuket/3073/2013) - B-Yamagata
Wa19 (B/Washington/2/2019) - B-Victoria
To enhance analysis of immune response, I derived three new predictors from the raw hemagglutination inhibition (HAI) titre values across four vaccine strains:
The first predictor, log_d0_sp_sum, represents the natural logarithmic sum of HAI titre values at day 0 for each strain, indicating baseline seroprotection levels.
The second predictor, log_d28_sp_sum, represents the natural logarithmic sum of HAI titre values at day 28 for each strain, indicating post-vaccination seroprotection levels. The third predictor, d28_sc_foldchange, quantifies the fold change in seroprotection between day 0 and day 28, calculated as the ratio of seroprotection levels at day 28 to those at day 0.
# Read trait data and restrict to the study cohort.
traits <- read_csv("/Users/jacenai/Desktop/Matteo_Lab/UGA6/methylation/Traits_HumanBlood256_Reed2023-9206_021624mjt_HP(revised).csv") |>
  filter(sampleID %in% study_cohort)

# Seroprotection titre sums at day 0 and day 28, plus their logs and the
# day-28 fold change. Fix: the log columns reuse the already-computed
# sums instead of re-adding the four strain titres a second time.
traits <- traits |>
  mutate(
    d0_sp_sum  = d0_titre_vic19 + d0_titre_tas20 + d0_titre_phu13 + d0_titre_wa19,
    d28_sp_sum = d28_titre_vic19 + d28_titre_tas20 + d28_titre_phu13 + d28_titre_wa19,
    log_d0_sp_sum  = log(d0_sp_sum),
    log_d28_sp_sum = log(d28_sp_sum),
    d28_sc_foldchange = d28_sp_sum / d0_sp_sum,
    log_d28_sc_foldchange = log(d28_sc_foldchange)
  )

# Read deconvoluted cell-type proportions derived from methylation data
# by CellFi, dropping the Residual column and harmonizing sample IDs.
celltype_methylation <- read_delim(
  "/Users/jacenai/Desktop/Matteo_Lab/UGA6/methylation/cell_type_specific_methy.txt",
  delim = "\t",
  col_names = TRUE
) |>
  as_tibble() |>
  dplyr::select(-Residual) |>
  dplyr::rename(sampleID = `Samples`) |>
  # change the first "_" of the sample identifier to "-"
  dplyr::mutate(sampleID = sub("_", "-", sampleID)) |>
  filter(sampleID %in% study_cohort)

# PCA on cell-type proportions; keep only PC1 because it explains ~60%
# of the variance.
celltype_methylation_pca <- celltype_methylation |>
  dplyr::select(-sampleID) |>
  prcomp(center = TRUE, scale = FALSE) |>
  predict(celltype_methylation) |>
  as_tibble() |>
  dplyr::select(PC1) |>
  mutate(sampleID = celltype_methylation$sampleID) |>
  rename_with(~ str_replace(., "PC", "Cell_Type_PC"), everything())

traits <- traits |>
  left_join(celltype_methylation_pca, by = "sampleID")

# Keep only the columns needed downstream.
traits <- traits |>
  dplyr::select(
    "sampleID", "Cell_Type_PC1",
    "log_d0_sp_sum", "log_d28_sp_sum", "d28_sc_foldchange",
    "seroresponse_3grp", "seroresponse_2grp",
    "age", "bmi", "sex", "ancestry", "vaccine", "plate"
  )
Imputation
Impute missing values in traits
# Check for missing values in traits; report only the columns that have
# any. Fix: compute colSums(is.na(traits)) once instead of twice.
na_counts <- colSums(is.na(traits))
na_counts[na_counts > 0] |>
  as.data.frame()
colSums(is.na(traits))
bmi 4
# Check for missing values in MethMatrix; report only the CpG columns
# that have any. Fix: compute colSums(is.na(MethMatrix)) once, not twice.
meth_na_counts <- colSums(is.na(MethMatrix))
meth_na_counts[meth_na_counts > 0] |>
  as.data.frame()
Warning in text.default(pos.xlabel[, 1], pos.xlabel[, 2], newcolnames, srt =
tl.srt, : "upper" is not a graphical parameter
Warning in text.default(pos.ylabel[, 1], pos.ylabel[, 2], newrownames, col =
tl.col, : "upper" is not a graphical parameter
Warning in title(title, ...): "upper" is not a graphical parameter
# use this!
# Fix: suppressMessages() must wrap the *printing* of the ggpairs
# object — the messages are emitted when the plot is drawn, not when the
# object is built, so suppressing around the unprinted object is a no-op.
p <- ggpairs(raw_traits, title = "Pairwise plot of raw traits")
suppressMessages(print(p))
Based on EDA, we finally include age, sex, Cell_Type_PC1, bmi, ancestry, vaccine, log_d0_sp_sum, log_d28_sp_sum, and d28_sc_foldchange as predictors in the model.
Sequential Orthogonalization
Problem
In the linear model \(\mathbf{Y} = \beta_0 \mathbf{1}_n + \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\varepsilon}\), where \(\mathbf{Y} \in \mathbb{R}^n\) is the response vector, \(\mathbf{X} = [\mathbf{X}_1,\dots,\mathbf{X}_p] \in \mathbb{R}^{n \times p}\) is the design matrix of predictors, \(\boldsymbol{\beta} \in \mathbb{R}^p\) is the coefficient vector, and \(\boldsymbol{\varepsilon} \in \mathbb{R}^n\) is the error term. Identifiability of the regression coefficients requires that \(\mathbf{X}\) have full column rank, i.e., \(\operatorname{rank}(\mathbf{X}) = p\). When predictors are collinear, \(\mathbf{X}^\top \mathbf{X}\) becomes singular or nearly singular, and the ordinary least squares estimator \(\hat{\boldsymbol{\beta}} = (\mathbf{X}^\top \mathbf{X})^{-1}\mathbf{X}^\top \mathbf{Y}\) is either undefined or numerically unstable. In such cases, identifiability of the individual coefficients fails, since whenever \(\operatorname{rank}(\mathbf{X}) < p\), there exist distinct coefficient vectors \(\boldsymbol{\beta}, \boldsymbol{\beta}' \in \mathbb{R}^p\) with \(\boldsymbol{\beta} \neq \boldsymbol{\beta}'\) such that \(\mathbf{X}\boldsymbol{\beta} = \mathbf{X}\boldsymbol{\beta}'\), and hence the fitted values \(\hat{\mathbf{Y}} = \mathbf{X}\boldsymbol{\beta}\) are not uniquely determined by the coefficients. This issue arises frequently in DNA methylation studies. For example, when modeling methylation levels as the response with age and immune response variables as predictors, age is often strongly correlated with both methylation levels (e.g., through epigenetic clocks) and immune response measures. Such correlations induce collinearity in the design matrix, reducing its effective rank and making it difficult to disentangle the effects of immune response variables on methylation from confounding by age.
Proposed Solution
To restore identifiability, we propose a sequential orthogonalization procedure inspired by the Gram–Schmidt process. For each predictor \(\mathbf{X}_j \in \mathbb{R}^n\), define \[
\mathbf{X}_j^\perp = \mathbf{X}_j - \mathcal{P}_{\{\mathbf{X}_1^\perp,\dots,\mathbf{X}_{j-1}^\perp\}}(\mathbf{X}_j),
\] where \(\mathcal{P}_{\{\mathbf{X}_1^\perp,\dots,\mathbf{X}_{j-1}^\perp\}}(\mathbf{X}_j)\) denotes the projection of \(\mathbf{X}_j\) onto the span of the previously orthogonalized predictors. By construction, the transformed design matrix \(\mathbf{X}^\perp = \big(\mathbf{X}_1^\perp, \dots, \mathbf{X}_p^\perp\big)\) has orthogonal columns, i.e., \((\mathbf{X}^\perp)^\top \mathbf{X}^\perp\) is diagonal. This ensures full column rank and yields identifiable coefficient estimates that represent the unique contribution of each predictor to the response.
Gram-Schmidt Process
The Gram-Schmidt process is an algorithm used to convert a set of linearly independent vectors into an orthonormal basis. Given a set of vectors \(\mathbf{a}_1, \mathbf{a}_2, \ldots, \mathbf{a}_k \in \mathbb{R}^n\), the Gram-Schmidt process constructs an orthonormal set \(\mathbf{q}_1, \mathbf{q}_2, \ldots, \mathbf{q}_k\) such that:
Each vector \(\mathbf{q}_i\) is orthogonal to all preceding vectors \(\mathbf{q}_j\) for \(j < i\).
Each vector \(\mathbf{q}_i\) has unit length, i.e., \(\|\mathbf{q}_i\| = 1\).
This process is fundamental in linear algebra for applications that require orthogonal bases, such as QR decomposition and numerical stability improvements in computational algorithms. The process can be summarized as follows:
Let \(\mathbf{a}_1\) and \(\mathbf{a}_2\) be two linearly independent vectors in \(\mathbb{R}^2\). The Gram-Schmidt process for these vectors proceeds as follows:
Orthogonalize \(\mathbf{a}_1\):
Set \(\tilde{\mathbf{q}}_1 = \mathbf{a}_1\).
Normalize \(\tilde{\mathbf{q}}_1\):
Define \(\mathbf{q}_1 = \frac{\tilde{\mathbf{q}}_1}{\|\tilde{\mathbf{q}}_1\|}\), making \(\mathbf{q}_1\) a unit vector in the direction of \(\tilde{\mathbf{q}}_1\).
Orthogonalize \(\mathbf{a}_2\):
Remove the component of \(\mathbf{a}_2\) in the direction of \(\mathbf{q}_1\) by defining \(\tilde{\mathbf{q}}_2 = \mathbf{a}_2 - (\mathbf{q}_1 \cdot \mathbf{a}_2) \mathbf{q}_1\).
Normalize \(\tilde{\mathbf{q}}_2\):
Define \(\mathbf{q}_2 = \frac{\tilde{\mathbf{q}}_2}{\|\tilde{\mathbf{q}}_2\|}\), creating a unit vector orthogonal to \(\mathbf{q}_1\).
These steps ensure that the resulting set \(\{\mathbf{q}_1, \mathbf{q}_2\}\) forms an orthonormal basis for the span of \(\{\mathbf{a}_1, \mathbf{a}_2\}\).
Below, we provide R code to visualize each step in the Gram-Schmidt process for two vectors \(\mathbf{a}_1 = (0.5, 2)\) and \(\mathbf{a}_2 = (1.5, 0.5)\).
The orthogonalization process sequence matters. The sequence is:
age -> sex -> Cell_Type_PC1 -> bmi -> ancestry -> vaccine -> log_d28_sp_sum -> log_d0_sp_sum -> d28_sc_foldchange.
And I didn’t normalize the orthogonalized factors.
Implementation of sequential orthogonalization
#' Sequentially orthogonalize trait columns (Gram-Schmidt style).
#'
#' Each trait after the first is replaced by the residuals of a regression
#' of that trait on all *previously orthogonalized* traits, so each column
#' carries only the variation not explained by its predecessors. The first
#' trait is left untouched, and the result is not normalized (residuals
#' are orthogonal, not unit-length).
#'
#' @param df Data frame containing the trait columns.
#' @param traits Character vector of column names, in orthogonalization order.
#' @return `df` with each trait column (except the first) replaced by its
#'   residuals.
orthogonalize <- function(df, traits) {
  # Validate inputs early so a misspelled trait fails loudly, not inside lm().
  stopifnot(is.data.frame(df), all(traits %in% names(df)))
  orthogonalized <- df
  for (i in seq_along(traits)) {
    # The first trait has no predecessors to project out.
    if (i == 1) next
    current_trait <- traits[i]
    # Regress the current trait on all previously orthogonalized traits.
    # NOTE(review): lm() drops rows with NA, which would misalign the
    # residuals with df's rows — inputs are assumed complete (imputed).
    fml <- as.formula(
      paste(current_trait, "~", paste(traits[seq_len(i - 1)], collapse = "+"))
    )
    model <- lm(fml, data = orthogonalized)
    # Replace the trait with its residual component.
    orthogonalized[[current_trait]] <- residuals(model)
  }
  orthogonalized
}
Sequential orthogonalization relies on the sequence of traits provided. Here, I will orthogonalize the traits in the order of age, sex, Cell_Type_PC1, bmi, ancestry, vaccine, log_d28_sp_sum, log_d0_sp_sum, and d28_sc_foldchange because I want to remove the effects of the previous traits on the three immune response traits (log_d28_sp_sum, log_d0_sp_sum, and d28_sc_foldchange).
# Traits to orthogonalize, in order: covariates first, then the three
# immune-response traits so the latter are adjusted for the former.
interested_traits <- c(
  "age", "sex", "Cell_Type_PC1", "bmi", "ancestry", "vaccine",
  "log_d28_sp_sum", "log_d0_sp_sum", "d28_sc_foldchange"
)

# Orthogonalize the predictors sequentially.
orthogonalized_traits_PCA <- orthogonalize(traits[interested_traits], interested_traits)

# use this!
ggpairs(orthogonalized_traits_PCA, title = "Pairwise Plots of Orthogonalized Traits")
EDA after Sequential Orthogonalization
Plot the PCA plot for each variable
# PCA on the methylation matrix (samples as rows, CpGs as columns).
meth_for_pca <- MethMatrix %>%
  as.data.frame() %>%
  column_to_rownames(var = "sampleID")
MethMatrix_1_pca <- prcomp(meth_for_pca, scale = FALSE, center = TRUE)

# Percentage of variance explained by each principal component.
round(MethMatrix_1_pca$sd^2 / sum(MethMatrix_1_pca$sd^2) * 100, 2)
# Extract the first two principal components, join the traits, and relabel
# the coded covariates for plotting. Fix: each coded covariate is labelled
# first and *then* converted to a factor — the original factored `sex`
# before relabelling and left `vaccine` as a character column.
MethMatrix_1_pca_df <- as.data.frame(MethMatrix_1_pca$x) %>%
  dplyr::select(PC1, PC2) %>%
  rownames_to_column(var = "sampleID") %>%
  left_join(traits, by = "sampleID") %>%
  # ancestry: 0 = "white", 1 = "non-white"
  mutate(ancestry = ifelse(ancestry == 0, "white", "non-white")) %>%
  mutate(ancestry = as.factor(ancestry)) %>%
  # sex: 0 = "male", 1 = "female"
  mutate(sex = ifelse(sex == 0, "male", "female")) %>%
  mutate(sex = as.factor(sex)) %>%
  # vaccine: 0 = "standard dose", 1 = "high dose"
  # mutate(plate = as.factor(plate)) %>%
  mutate(vaccine = ifelse(vaccine == 0, "standard dose", "high dose")) %>%
  mutate(vaccine = as.factor(vaccine))

#' PCA scatter plot coloured by one variable (Nature/Science style).
#'
#' Continuous variables get a colour gradient; discrete variables get a
#' manual palette (up to 4 levels). Axis labels report the percentage of
#' variance explained by PC1/PC2 from MethMatrix_1_pca.
#'
#' @param data Data frame with PC1, PC2 and the colouring variable.
#' @param variable Column name (string) used for colour and title.
#' @return A ggplot object.
plot_pca <- function(data, variable) {
  # Determine if the variable is continuous or discrete.
  is_continuous <- is.numeric(data[[variable]])
  # Set up the colour scale based on variable type.
  color_scale <- if (is_continuous) {
    scale_color_gradientn(colors = c("#1f78b4", "#fdbf6f", "#d7191c"))
  } else {
    scale_color_manual(values = c("#1f78b4", "#d7191c", "#33a02c", "#ff7f00"))
  }
  p <- ggplot(data, aes(x = PC1, y = PC2, color = .data[[variable]])) +
    geom_point(size = 1.8, alpha = 0.8) +
    color_scale +
    labs(
      x = paste("PC1 (", round(MethMatrix_1_pca$sdev[1]^2 / sum(MethMatrix_1_pca$sdev^2) * 100, 2), "%)", sep = ""),
      y = paste("PC2 (", round(MethMatrix_1_pca$sdev[2]^2 / sum(MethMatrix_1_pca$sdev^2) * 100, 2), "%)", sep = ""),
      title = variable,  # variable name as the title
      color = variable
    ) +
    theme_classic(base_size = 14) +
    theme(
      axis.text = element_text(color = "black"),
      axis.title = element_text(),
      legend.title = element_blank(),
      legend.position = "right",
      plot.title = element_text(hjust = 0.5),  # centre the title
      # Fix: `size` is deprecated for element_rect()/element_line() since
      # ggplot2 3.4.0 — use `linewidth` (removes the deprecation warning).
      panel.border = element_rect(color = "black", fill = NA, linewidth = 0.7),
      panel.grid.major = element_line(color = "gray90", linewidth = 0.3),
      panel.grid.minor = element_blank()
    )
  return(p)
}
# List of variables to plot. Fix: the rendered source had the name
# garbled by a line wrap ("plotvariables"), while lapply() below
# references `variables` — restore the intended name.
variables <- c(
  "log_d0_sp_sum", "log_d28_sp_sum", "d28_sc_foldchange",
  "age", "bmi", "Cell_Type_PC1", "sex", "ancestry", "vaccine"
)

# Generate one PCA plot per variable, titled with the variable name.
plots <- lapply(variables, function(var) plot_pca(MethMatrix_1_pca_df, var))
Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
ℹ Please use the `linewidth` argument instead.
# Arrange the per-variable PCA plots in a 3-column grid, collecting the
# shared axis titles, then display the combined figure.
combined_plot <- wrap_plots(plots, ncol = 3, axis_titles = "collect")
print(combined_plot)
# save the plot# ggsave("PCA_plot.png", combined_plot, width = 12, height = 8, units = "in")
Multiple Linear Regression Model (LOOCV)
In this analysis, we consider the following linear model for methylation data:
\(\mathbf{M} \in \mathbb{R}^{n \times p}\) is the methylation matrix (response),
\(\mathbf{X} \in \mathbb{R}^{n \times k}\) is the matrix of orthogonalized predictors (e.g., latent factors),
\(\boldsymbol{\beta} \in \mathbb{R}^{k \times p}\) is the matrix of regression coefficients, and
\(\boldsymbol{\epsilon} \in \mathbb{R}^{n \times p}\) is the matrix of error terms.
Leave-one-out cross-validation (LOOCV) is used to select the best predictive model.
We assume the Gauss–Markov conditions hold, ensuring the least squares estimator \(\hat{\boldsymbol{\beta}}\) is the best linear unbiased estimator (BLUE):
We will fit a multivariate linear regression model to predict the methylation levels using the orthogonalized predictors. We will use leave-one-out cross-validation (LOOCV) to select the best model with the lowest root mean squared error (RMSE).
#' Leave-one-out cross-validation of a multivariate linear model of
#' methylation on the (orthogonalized) traits.
#'
#' For each sample i, fits lm() of all CpG columns on the nine predictors
#' using the remaining samples, predicts the held-out sample, and records
#' that fold's RMSE across CpG sites. As before, returns the model from
#' the fold with the lowest held-out RMSE together with that RMSE. Fix
#' (backward-compatible addition): also returns `mean_rmse`, the RMSE
#' averaged over all folds — the proper LOOCV generalization-error
#' estimate; the minimum over folds only reflects the easiest sample.
#'
#' @param MethMatrix Numeric matrix/data frame of methylation values,
#'   samples in rows, CpGs in columns, row-aligned with the traits.
#' @param orthogonalized_traits Data frame with columns log_d0_sp_sum,
#'   log_d28_sp_sum, d28_sc_foldchange, age, bmi, Cell_Type_PC1, vaccine,
#'   ancestry, sex.
#' @return list(best_model = lm fit, best_rmse = numeric,
#'   mean_rmse = numeric).
methylation_orthogonalized_continuous_model <- function(MethMatrix, orthogonalized_traits) {
  n <- nrow(MethMatrix)
  fold_rmse <- numeric(n)  # preallocated per-fold RMSEs
  best_rmse <- Inf         # running minimum over folds
  best_model <- NULL
  for (i in seq_len(n)) {
    # Leave sample i out as the validation set.
    train_MethMatrix <- MethMatrix[-i, , drop = FALSE]
    train_traits <- orthogonalized_traits[-i, , drop = FALSE]
    validation_MethMatrix <- MethMatrix[i, , drop = FALSE]
    validation_traits <- orthogonalized_traits[i, , drop = FALSE]
    # Multivariate linear model: every CpG regressed on the predictors.
    model <- lm(
      as.matrix(train_MethMatrix) ~ log_d0_sp_sum + log_d28_sp_sum +
        d28_sc_foldchange + age + bmi + Cell_Type_PC1 + vaccine +
        ancestry + sex,
      data = train_traits
    )
    # Predict the left-out observation and score it.
    predicted_on_validation <- predict(model, newdata = validation_traits)
    fold_rmse[i] <- sqrt(mean((as.numeric(validation_MethMatrix) - predicted_on_validation)^2))
    if (fold_rmse[i] < best_rmse) {
      best_rmse <- fold_rmse[i]
      best_model <- model
    }
  }
  list(best_model = best_model, best_rmse = best_rmse, mean_rmse = mean(fold_rmse))
}
Run the function with MethMatrix_1 and orthogonalized_traits_PCA as inputs (the latter contains the 6 lists of orthogonalized traits).
Since MethMatrix values are beta values (ranging from 0 to 1), an RMSE of 0.0409 means the model's predictions are off by about 4% methylation on average per site, which is usually considered quite accurate in epigenetic modeling.
Model Evaluation
Pseudoinverse of the Coefficient Matrix
Moore-Penrose Inverse Matrix
We aim to evaluate how well the estimated coefficient matrix \(\hat{\boldsymbol{\beta}}\) captures the relationship between the latent factor matrix \(\hat{\mathbf{X}}\) and the observed methylation matrix \(\mathbf{M}\). While the model is trained via:
our evaluation assumes the methylation matrix \(\mathbf{M}\) remains fixed (i.e., unchanged), and we aim to recover the factors \(\hat{\mathbf{X}}\) from the fitted model.
Assuming a perfect reconstruction under the fitted coefficients, we posit:
This formulation allows us to estimate \(\hat{\mathbf{X}}\) from observed methylation data and visualize its agreement with the true (or originally estimated) latent factors. A close match between \(\hat{\mathbf{X}}\) and \(\mathbf{X}\) would result in predicted values aligning along the identity line in a scatter plot.
Next, we will calculate the pseudoinverse of the coefficient matrix to predict the factors for plotting.
#' Recover factor estimates from methylation via the Moore-Penrose
#' pseudoinverse of the fitted coefficient matrix: X_hat = M %*% pinv(B),
#' where B = coef(model) includes the intercept row.
#'
#' @param model Multivariate lm fit; coef() returns a (k+1) x p matrix
#'   with predictor names (including "(Intercept)") as row names.
#' @param meth_matrix Methylation matrix (samples x CpGs), column-aligned
#'   with the model's responses.
#' @return A tibble of predicted factors, one column per coefficient row.
predict_factors <- function(model, meth_matrix) {
  # Coefficient matrix from the multivariate linear model.
  coeff_matrix <- coef(model)
  # Pseudoinverse of the coefficient matrix (MASS::ginv).
  pseudoinv_coeff <- ginv(coeff_matrix)
  predicted <- as.matrix(meth_matrix) %*% pseudoinv_coeff
  # Fix: name the columns *before* converting to a tibble — an unnamed
  # matrix triggers tibble's `.name_repair` compatibility warning.
  colnames(predicted) <- rownames(coeff_matrix)
  as_tibble(predicted)
}
Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.
Predicted vs. the Original Factors
Calculate the predicted factors using the function predict_factors and plot the predicted vs. actual factors using the function plot_predicted_vs_actual for all 6 models.
We are also interested in which CpG sites are associated with the traits. Therefore, we will perform an EWAS analysis to identify the CpG sites that are highly associated with the traits. From the multivariate linear model above, we can obtain the coefficients and t-test p-values for each predictor at each CpG site. After adjusting the p-values for multiple testing, we can identify the CpG sites that are significantly associated with the traits.
#' Extract the tidy() rows for one predictor and standardize column names.
#'
#' @param model_summary broom::tidy() output of the multivariate lm
#'   (columns: response, term, estimate, std.error, statistic, p.value).
#' @param variable Predictor name, matched as a substring of `term` so
#'   factor-expanded terms (e.g. "sex1") are also captured.
#' @return Tibble with CpG_site, Coefficient, StdError, tValue, pValue, label.
get_model_summary <- function(model_summary, variable) {
  model_summary |>
    # Fix: the original compared `term ==` against a possibly
    # multi-element unique() vector, which silently recycles when the
    # pattern matches several terms; filter on the pattern directly.
    filter(grepl(variable, term)) |>
    # Label the rows with the predictor they belong to.
    mutate(variable = variable) |>
    # mutate(pValue_adjusted = p.adjust(p.value, method = 'BH')) |>
    dplyr::select(response, estimate, std.error, statistic, p.value, variable) |>
    rename_with(~ c("CpG_site", "Coefficient", "StdError", "tValue", "pValue", "label"), everything())
}

# Load the fitted model and tidy its coefficient table.
fit <- readRDS("8.13.25_methylation_orthogonalized_continuous_model.rds")
# fit <- readRDS("1.24.25_methylation_orthogonalized_continuous_model.rds")
tidy_tbl <- tidy(fit$best_model)

# Predictors to extract from the model summary.
vars <- c(
  "age", "sex", "bmi", "Cell_Type", "ancestry", "vaccine",
  "log_d0_sp_sum", "log_d28_sp_sum", "d28_sc_foldchange"
)

# Build the combined EWAS summary, one set of rows per predictor.
ewas_summary <- map_dfr(vars, ~ get_model_summary(tidy_tbl, .x))
Here I use the Benjamini-Hochberg method to adjust the p-values, because we have a tremendous number of tests, where some false positives are tolerable but controlling the overall false discovery rate is key.